# Question 1 ----
library(readr)

# Set the working directory and load both CSV files into the global
# environment.
# NOTE(review): this absolute path only exists on the author's machine; a
# project-relative path would make the script portable.
setwd("/Users/mitchellanderson/Desktop/usc.health.dat.sci")
PM2022 <- read.csv("PM_data_2022.csv")
PM2002 <- read.csv("PM_data_2002.csv")

# EDA: check the dimensions, headers, footers, variable names and variable
# types of both data sets.
dim(PM2022)
dim(PM2002)
head(PM2022)
head(PM2002)
tail(PM2022)
tail(PM2002)
str(PM2022)
str(PM2002)
colnames(PM2022)
colnames(PM2002)

# Check the distribution of the key variable we are analyzing (PM2.5).
summary(PM2022$Daily.Mean.PM2.5.Concentration)
Min. 1st Qu. Median Mean 3rd Qu. Max.
-6.700 4.100 6.800 8.414 10.700 302.500
# Distribution of the 2002 daily mean PM2.5 concentrations.
summary(PM2002[["Daily.Mean.PM2.5.Concentration"]])
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00 7.00 12.00 16.12 20.50 104.30
The exploratory data analysis shows that the two datasets share identical column names and the same number of columns. The 2002 data has 15,976 observations while the 2022 data has 59,918 observations. The summary statistics show that the 2002 data has a mean of 16.12, a median of 12, a max of 104.30, a min of 0, a 1st quartile of 7, and a 3rd quartile of 20.50. The 2022 data has a mean of 8.414, a median of 6.8, a min of -6.7, a max of 302.5, a 1st quartile of 4.1, and a 3rd quartile of 10.7.
# Question 2 ----
# Stack the 2002 and 2022 data frames into one table and tag every row with
# its year, so later summaries can stratify by year only (not month or day).
PM_all <- rbind(PM2002, PM2022)

# Parse the Date column from mm/dd/yyyy text into Date values.
PM_all[["Date"]] <- as.Date(PM_all[["Date"]], format = "%m/%d/%Y")

# Lower-case "date" holds only the four-digit year, as a character string.
PM_all[["date"]] <- format(PM_all[["Date"]], "%Y")

# Keep only the rows where both site coordinates are present.
has_coords <- complete.cases(PM_all[, c("Site.Latitude", "Site.Longitude")])
PM_all <- PM_all[has_coords, ]
#Question 3##For leaflet, see code from lecture. subset only californialibrary(leaflet)library(dplyr)
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
# Question 4 ----
# Per-year data quality table built from the combined data set: total rows,
# count of negative PM2.5 readings, count of missing PM2.5 readings, and each
# count as a proportion of that year's rows.
data_check <- PM_all %>%
  group_by(date) %>%
  summarise(
    obs_tot = n(),
    # NA readings are skipped here and counted separately below
    neg_obs = sum(Daily.Mean.PM2.5.Concentration < 0, na.rm = TRUE),
    missing_obs = sum(is.na(Daily.Mean.PM2.5.Concentration)),
    neg_proportion = neg_obs / obs_tot,
    missing_proportion = missing_obs / obs_tot,
    # return an ungrouped table
    .groups = "drop"
  )
Question 4 creates a new table titled “data_check”, which reports the negative and missing values of the daily mean PM 2.5 concentrations, stratified by year. The table shows that neither data set had missing observations and that the 2022 data had 215 negative observations, a proportion of 0.0036.
# Question 5 ----
# Three levels of comparison between 2002 and 2022: statewide, by county,
# and Los Angeles only (office hour notes: use summarize() and arrange()).
library(ggplot2)

# Statewide: minimum, maximum, mean and median of the daily mean PM2.5
# concentration for each year, ignoring missing readings.
sum_statewide <- summarise(
  group_by(PM_all, date),
  PM25_state_min = min(Daily.Mean.PM2.5.Concentration, na.rm = TRUE),
  PM25_state_max = max(Daily.Mean.PM2.5.Concentration, na.rm = TRUE),
  PM25_state_mean = mean(Daily.Mean.PM2.5.Concentration, na.rm = TRUE),
  PM25_state_median = median(Daily.Mean.PM2.5.Concentration, na.rm = TRUE),
  .groups = "drop"
)
print(sum_statewide)
# Statewide boxplot: one box per year, filled by year.
ggplot(
  data = PM_all,
  mapping = aes(
    x = factor(date),
    y = Daily.Mean.PM2.5.Concentration,
    fill = factor(date)
  )
) +
  geom_boxplot(outlier.size = 0.9, outlier.alpha = 0.7) +
  labs(
    title = "2002 & 2022 California Daily Mean PM2.5 Levels ",
    x = "Year",
    y = "Daily Mean PM2.5 Levels",
    fill = "Year"
  ) +
  theme_minimal(base_size = 20)
# County level: minimum, maximum, mean and median of the daily mean PM2.5
# concentration for every county-year pair, ignoring missing readings.
sum_countywide <- summarise(
  group_by(PM_all, County, date),
  PM25_county_min = min(Daily.Mean.PM2.5.Concentration, na.rm = TRUE),
  PM25_county_max = max(Daily.Mean.PM2.5.Concentration, na.rm = TRUE),
  PM25_county_mean = mean(Daily.Mean.PM2.5.Concentration, na.rm = TRUE),
  PM25_county_median = median(Daily.Mean.PM2.5.Concentration, na.rm = TRUE),
  .groups = "drop"
)

# Print the full county-by-year table (no top-ten filtering happens here).
print(sum_countywide)
# A tibble: 98 × 6
County date PM25_county_min PM25_county_max PM25_county_mean
<chr> <chr> <dbl> <dbl> <dbl>
1 Alameda 2002 1.9 61.6 14.3
2 Alameda 2022 -0.7 35.5 8.20
3 Butte 2002 1 88 14.8
4 Butte 2022 -0.6 42.8 6.19
5 Calaveras 2002 2 40 9.9
6 Calaveras 2022 0 25.9 6.04
7 Colusa 2002 1 57 11.7
8 Colusa 2022 0.6 37 7.61
9 Contra Costa 2002 2 76.7 15.1
10 Contra Costa 2022 0.9 37.3 8.24
# ℹ 88 more rows
# ℹ 1 more variable: PM25_county_median <dbl>
# Median PM2.5 level for each county and year. This reassigns sum_countywide,
# which from here on holds only County, date and the median column.
sum_countywide <- summarise(
  group_by(PM_all, County, date),
  PM25_county_median = median(Daily.Mean.PM2.5.Concentration, na.rm = TRUE),
  .groups = "drop"
)

# Top ten counties, ranked by the larger of each county's yearly medians.
top_counties <- sum_countywide %>%
  group_by(County) %>%
  summarise(max_median_PM25 = max(PM25_county_median, na.rm = TRUE)) %>%
  arrange(desc(max_median_PM25)) %>%
  slice_head(n = 10) %>%
  pull(County)

# Daily observations restricted to those ten counties.
PM25_10 <- filter(PM_all, County %in% top_counties)

# One panel per county, each with its own y scale, one box per year.
ggplot(
  data = PM25_10,
  mapping = aes(
    x = factor(date),
    y = Daily.Mean.PM2.5.Concentration,
    fill = factor(date)
  )
) +
  geom_boxplot(outlier.size = 0.9, outlier.alpha = 0.7) +
  facet_wrap(~ County, scales = "free_y") +
  labs(
    title = "Top 10 Counties with Highest Daily Mean PM2.5 in 2002 and 2022",
    x = "Year",
    y = "Top 10 Mean PM2.5 Levels by County",
    fill = "Year"
  ) +
  theme_minimal(base_size = 12)
# Los Angeles: keep only the "Los Angeles-North Main Street" site for the
# two study years. The date column stores the year as text.
LA_sites <- filter(
  PM_all,
  Local.Site.Name == "Los Angeles-North Main Street",
  date %in% c("2002", "2022")
)

# Per-year mean, median, minimum and maximum for that site.
# NOTE(review): "PM_25_city_median" has an extra underscore compared with
# the other column names; it is kept unchanged here.
sum_citywide <- summarise(
  group_by(LA_sites, date),
  PM25_city_mean = mean(Daily.Mean.PM2.5.Concentration, na.rm = TRUE),
  PM_25_city_median = median(Daily.Mean.PM2.5.Concentration, na.rm = TRUE),
  PM25_city_min = min(Daily.Mean.PM2.5.Concentration, na.rm = TRUE),
  PM25_city_max = max(Daily.Mean.PM2.5.Concentration, na.rm = TRUE)
)
print(sum_citywide)
# Boxplot of the Los Angeles site readings: one box per year, filled by year.
ggplot(
  data = LA_sites,
  mapping = aes(
    x = factor(date),
    y = Daily.Mean.PM2.5.Concentration,
    fill = factor(date)
  )
) +
  geom_boxplot(outlier.size = 0.9, outlier.alpha = 0.7) +
  labs(
    title = "Los Angeles Daily Mean PM2.5 Levels 2002 vs 2022",
    x = "Year",
    y = "Daily Mean PM2.5 Levels",
    fill = "Year"
  ) +
  theme_minimal(base_size = 14)
The statistical summaries across the state, between counties, and in the city of Los Angeles reveal multiple characteristics of the PM 2.5 levels from 2002 to 2022. At the state level, there was a decrease in daily mean PM 2.5 concentrations. At the county level, the top ten counties with the greatest median concentrations in either of the two years were Fresno, Kern, Kings, Los Angeles, Merced, Orange, Riverside, San Diego, Tulare, and Ventura. In the city of Los Angeles alone, represented here by the Los Angeles-North Main Street monitoring site, there was a decrease in the daily mean levels of PM 2.5 from 2002 to 2022.